import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
pd.set_option('display.max_rows', None)
seattle_calendar = pd.read_csv('calendar.csv')
seattle_listings = pd.read_csv('listings.csv')
seattle_calendar.head()
| listing_id | date | available | price | |
|---|---|---|---|---|
| 0 | 241032 | 2016-01-04 | t | $85.00 |
| 1 | 241032 | 2016-01-05 | t | $85.00 |
| 2 | 241032 | 2016-01-06 | f | NaN |
| 3 | 241032 | 2016-01-07 | f | NaN |
| 4 | 241032 | 2016-01-08 | f | NaN |
seattle_calendar.shape
(1393570, 4)
seattle_calendar.isnull().sum()
listing_id 0 date 0 available 0 price 459028 dtype: int64
seattle_calendar.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1393570 entries, 0 to 1393569 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 listing_id 1393570 non-null int64 1 date 1393570 non-null object 2 available 1393570 non-null object 3 price 934542 non-null object dtypes: int64(1), object(3) memory usage: 42.5+ MB
Convert ['date'] column from object type to datetime type
seattle_calendar['date'] = pd.to_datetime(seattle_calendar['date'])
Remove the "$" , ".", "," signS from the ['price'] column and convert it to float
def format_price(col):
try:
value = float(col[1:])
except ValueError:
value = np.NaN
except TypeError:
value = np.NaN
return value
seattle_calendar['price'] = seattle_calendar['price'].apply(format_price)
seattle_calendar.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1393570 entries, 0 to 1393569 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 listing_id 1393570 non-null int64 1 date 1393570 non-null datetime64[ns] 2 available 1393570 non-null object 3 price 0 non-null float64 4 month 1393570 non-null int64 dtypes: datetime64[ns](1), float64(1), int64(2), object(1) memory usage: 53.2+ MB
plt.figure(figsize=(18,8))
sns.barplot(x=seattle_calendar['date'], y= seattle_calendar['price'], data=seattle_calendar['available']=='t')
<AxesSubplot:xlabel='date', ylabel='price'>
seattle_calendar['month'] = seattle_calendar.date.dt.month
plt.figure(figsize=(15,8))
sns.pointplot(x = seattle_calendar['month'].unique(),
y= seattle_calendar.groupby(['month']).price.mean())
plt.axhline(seattle_calendar.price.mean(), linestyle='--', color='red')
<matplotlib.lines.Line2D at 0x7f9ae7b99c70>
We can see that the monthly average price peaks during the month of JULY and is usually high between MAY and SEPTEMBER
seattle_listings.head()
| id | listing_url | scrape_id | last_scraped | name | summary | space | description | experiences_offered | neighborhood_overview | ... | review_scores_value | requires_license | license | jurisdiction_names | instant_bookable | cancellation_policy | require_guest_profile_picture | require_guest_phone_verification | calculated_host_listings_count | reviews_per_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 241032 | https://www.airbnb.com/rooms/241032 | 20160104002432 | 2016-01-04 | Stylish Queen Anne Apartment | NaN | Make your self at home in this charming one-be... | Make your self at home in this charming one-be... | none | NaN | ... | 10.0 | f | NaN | WASHINGTON | f | moderate | f | f | 2 | 4.07 |
| 1 | 953595 | https://www.airbnb.com/rooms/953595 | 20160104002432 | 2016-01-04 | Bright & Airy Queen Anne Apartment | Chemically sensitive? We've removed the irrita... | Beautiful, hypoallergenic apartment in an extr... | Chemically sensitive? We've removed the irrita... | none | Queen Anne is a wonderful, truly functional vi... | ... | 10.0 | f | NaN | WASHINGTON | f | strict | t | t | 6 | 1.48 |
| 2 | 3308979 | https://www.airbnb.com/rooms/3308979 | 20160104002432 | 2016-01-04 | New Modern House-Amazing water view | New modern house built in 2013. Spectacular s... | Our house is modern, light and fresh with a wa... | New modern house built in 2013. Spectacular s... | none | Upper Queen Anne is a charming neighborhood fu... | ... | 10.0 | f | NaN | WASHINGTON | f | strict | f | f | 2 | 1.15 |
| 3 | 7421966 | https://www.airbnb.com/rooms/7421966 | 20160104002432 | 2016-01-04 | Queen Anne Chateau | A charming apartment that sits atop Queen Anne... | NaN | A charming apartment that sits atop Queen Anne... | none | NaN | ... | NaN | f | NaN | WASHINGTON | f | flexible | f | f | 1 | NaN |
| 4 | 278830 | https://www.airbnb.com/rooms/278830 | 20160104002432 | 2016-01-04 | Charming craftsman 3 bdm house | Cozy family craftman house in beautiful neighb... | Cozy family craftman house in beautiful neighb... | Cozy family craftman house in beautiful neighb... | none | We are in the beautiful neighborhood of Queen ... | ... | 9.0 | f | NaN | WASHINGTON | f | strict | f | f | 1 | 0.89 |
5 rows × 92 columns
seattle_listings.shape
(3818, 92)
seattle_listings.isna().sum()
id 0 listing_url 0 scrape_id 0 last_scraped 0 name 0 summary 177 space 569 description 0 experiences_offered 0 neighborhood_overview 1032 notes 1606 transit 934 thumbnail_url 320 medium_url 320 picture_url 0 xl_picture_url 320 host_id 0 host_url 0 host_name 2 host_since 2 host_location 8 host_about 859 host_response_time 523 host_response_rate 523 host_acceptance_rate 773 host_is_superhost 2 host_thumbnail_url 2 host_picture_url 2 host_neighbourhood 300 host_listings_count 2 host_total_listings_count 2 host_verifications 0 host_has_profile_pic 2 host_identity_verified 2 street 0 neighbourhood 416 neighbourhood_cleansed 0 neighbourhood_group_cleansed 0 city 0 state 0 zipcode 7 market 0 smart_location 0 country_code 0 country 0 latitude 0 longitude 0 is_location_exact 0 property_type 1 room_type 0 accommodates 0 bathrooms 16 bedrooms 6 beds 1 bed_type 0 amenities 0 square_feet 3721 price 0 weekly_price 1809 monthly_price 2301 security_deposit 1952 cleaning_fee 1030 guests_included 0 extra_people 0 minimum_nights 0 maximum_nights 0 calendar_updated 0 has_availability 0 availability_30 0 availability_60 0 availability_90 0 availability_365 0 calendar_last_scraped 0 number_of_reviews 0 first_review 627 last_review 627 review_scores_rating 647 review_scores_accuracy 658 review_scores_cleanliness 653 review_scores_checkin 658 review_scores_communication 651 review_scores_location 655 review_scores_value 656 requires_license 0 license 3818 jurisdiction_names 0 instant_bookable 0 cancellation_policy 0 require_guest_profile_picture 0 require_guest_phone_verification 0 calculated_host_listings_count 0 reviews_per_month 627 dtype: int64
Drop those 7 rows with missing zipcode values for our calculation
seattle_listings.dropna(subset=['zipcode'],how='all',axis=0, inplace=True)
seattle_listings['property_type'].value_counts()
House 1730 Apartment 1706 Townhouse 118 Condominium 91 Loft 40 Bed & Breakfast 36 Other 22 Cabin 21 Bungalow 13 Camper/RV 13 Boat 7 Tent 5 Treehouse 3 Dorm 2 Chalet 2 Yurt 1 Name: property_type, dtype: int64
px.pie(seattle_listings, values=seattle_listings['room_type'].value_counts(),
names=seattle_listings['room_type'].unique(),hole=0.5, title='Room Type')
fig = px.density_mapbox(seattle_listings, lat='latitude', lon='longitude',
radius=5,center=dict(lat=0, lon=180), zoom=0,labels='zipcode',
mapbox_style="stamen-terrain")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
Map is tilted 90 degrees to the left
px.pie(seattle_listings, values= seattle_listings.groupby(['zipcode']).id.count(),
names=sorted(seattle_listings['zipcode'].unique()),hole=0.4,title='Zipcode with most listings')
They combine to form more than 30% of the AIRBNB listings in Seattle
small_data = seattle_listings[['neighbourhood_group_cleansed','zipcode']]
most_listed_zipcodes = small_data[(small_data['zipcode'] =='98122') |
(small_data['zipcode'] == '98103') |
(small_data['zipcode'] =='98102')]['neighbourhood_group_cleansed'].unique()
most_listed_zipcodes
array(['Other neighborhoods', 'Central Area', 'University District',
'Cascade', 'Downtown', 'Beacon Hill', 'Capitol Hill', 'Northgate'],
dtype=object)
Convert price to float format
seattle_listings['price'] = seattle_listings['price'].apply(format_price)
Replace "99\n98122" to "98122" as it's a typo
seattle_listings.zipcode.replace("99\n98122","98122",inplace=True)
mean_price = seattle_listings.price.mean()
print("The Average price of listings is: {}".format(mean_price))
The Average price of listings is: 127.68608923884514
a = seattle_listings.groupby(['zipcode']).price.mean()
a[a > mean_price].sort_values(ascending=False)
zipcode 98134 206.600000 98199 172.393939 98101 166.721393 98119 166.377622 98121 153.785714 98109 150.247525 98116 145.348214 98136 137.909091 98112 135.437126 98104 135.221053 98122 132.337292 98107 130.238889 Name: price, dtype: float64
Neighbourhoods with the most expensive listing prices in Seattle (higher than mean price $ 127) are shown above
costly_zipcodes = small_data[(small_data['zipcode'] =='98134') |
(small_data['zipcode'] == '98199') |
(small_data['zipcode'] =='98101')]['neighbourhood_group_cleansed'].unique()
costly_neighbourhood = pd.Series(costly_zipcodes)
costly_neighbourhood[1:]
1 Downtown 2 Magnolia 3 Interbay 4 Other neighborhoods 5 Capitol Hill dtype: object
a[a < mean_price].sort_values()
zipcode 98106 82.103448 98108 83.875000 98133 86.282609 98125 88.577465 98118 93.798701 98178 94.428571 98146 97.800000 98177 98.842105 98144 105.395000 98105 107.111111 98115 107.112994 98117 111.262195 98126 122.730159 98103 124.149746 98102 127.103245 Name: price, dtype: float64
Neighbourhoods with affordable prices (lower than mean price $ 127) are shown above
cheaper_zipcodes = small_data[(small_data['zipcode'] =='98106') |
(small_data['zipcode'] == '98108') |
(small_data['zipcode'] =='98133')]['neighbourhood_group_cleansed'].unique()
cheaper_neighbourhood = pd.Series(cheaper_zipcodes)
cheaper_neighbourhood[1:]
1 Beacon Hill 2 Delridge 3 Rainier Valley 4 Northgate dtype: object
Considering few important features
df = seattle_listings.loc[:,('id','host_response_time','host_response_rate','accommodates',
'bathrooms','bedrooms','beds','price','weekly_price','monthly_price',
'cleaning_fee','minimum_nights','review_scores_rating','instant_bookable')]
df.head()
| id | host_response_time | host_response_rate | accommodates | bathrooms | bedrooms | beds | price | weekly_price | monthly_price | cleaning_fee | minimum_nights | review_scores_rating | instant_bookable | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 241032 | within a few hours | 96% | 4 | 1.0 | 1.0 | 1.0 | 85.0 | NaN | NaN | NaN | 1 | 95.0 | f |
| 1 | 953595 | within an hour | 98% | 4 | 1.0 | 1.0 | 1.0 | 150.0 | $1,000.00 | $3,000.00 | $40.00 | 2 | 96.0 | f |
| 2 | 3308979 | within a few hours | 67% | 11 | 4.5 | 5.0 | 7.0 | 975.0 | NaN | NaN | $300.00 | 4 | 97.0 | f |
| 3 | 7421966 | NaN | NaN | 3 | 1.0 | 0.0 | 2.0 | 100.0 | $650.00 | $2,300.00 | NaN | 1 | NaN | f |
| 4 | 278830 | within an hour | 100% | 6 | 2.0 | 3.0 | 3.0 | 450.0 | NaN | NaN | $125.00 | 1 | 92.0 | f |
df.shape
(3811, 14)
df.dtypes
id int64 host_response_time object host_response_rate object accommodates int64 bathrooms float64 bedrooms float64 beds float64 price float64 weekly_price object monthly_price object cleaning_fee object minimum_nights int64 review_scores_rating float64 instant_bookable object dtype: object
def remove_ntile(x):
if type(x) is str:
return float(x.strip('%'))/100
return 1
df['host_response_rate'] = df.host_response_rate.apply(remove_ntile)
df['host_response_rate'] = df['host_response_rate'].astype(float)
df.describe()
| id | host_response_rate | accommodates | bathrooms | bedrooms | beds | price | minimum_nights | review_scores_rating | |
|---|---|---|---|---|---|---|---|---|---|
| count | 3.811000e+03 | 3811.000000 | 3811.000000 | 3795.000000 | 3805.000000 | 3810.000000 | 3810.000000 | 3811.000000 | 3165.000000 |
| mean | 5.552586e+06 | 0.955799 | 3.347678 | 1.258893 | 1.307490 | 1.735433 | 127.686089 | 2.371031 | 94.540916 |
| std | 2.961556e+06 | 0.111720 | 1.972802 | 0.589932 | 0.882151 | 1.139440 | 89.099100 | 16.320811 | 6.609669 |
| min | 3.335000e+03 | 0.170000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 20.000000 | 1.000000 | 20.000000 |
| 25% | 3.260828e+06 | 1.000000 | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 75.000000 | 1.000000 | 93.000000 |
| 50% | 6.118359e+06 | 1.000000 | 3.000000 | 1.000000 | 1.000000 | 1.000000 | 100.000000 | 2.000000 | 96.000000 |
| 75% | 8.035042e+06 | 1.000000 | 4.000000 | 1.000000 | 2.000000 | 2.000000 | 150.000000 | 2.000000 | 99.000000 |
| max | 1.034016e+07 | 1.000000 | 16.000000 | 8.000000 | 7.000000 | 15.000000 | 999.000000 | 1000.000000 | 100.000000 |
plt.figure(figsize=(15,7))
sns.heatmap(df.corr(), annot = True, fmt = '.2f', cmap = 'Blues')
<AxesSubplot:>
Due to restricts on project complexity we focused only on continous values
Occupancy
plt.figure(figsize=(15,6))
plt.title('Listing Occupancy')
sns.countplot(data = seattle_calendar, x = seattle_calendar['date'].dt.month,
hue = 'available',edgecolor=sns.color_palette("dark", 3))
<AxesSubplot:title={'center':'Listing Occupancy'}, xlabel='date', ylabel='count'>